R

R is ‘GNU S’, a freely available language and environment for statistical computing and graphics which provides a wide variety of statistical and graphical techniques

Download and install R (precompiled binary distribution) from UCLA http://cran.stat.ucla.edu, the nearest R mirror site http://cran.r-project.org/mirrors.html

R Studio

RStudio is a set of integrated development environment (IDE) designed to help you be more prudictive with R. It includes

Download and install RStudio from https://www.rstudio.com/products/rstudio/download

Calculate some numbers

1 + 2
## [1] 3
2^4
## [1] 16
log(16, 2)
## [1] 4
log(1000, 3)
## [1] 6.28771
choose(5, 2)
## [1] 10

Data types

1. Logical
relapse <- TRUE
typeof( relapse )
## [1] "logical"
relapse
## [1] TRUE
sbp <- c(160, 110, 105, 150, 170)
sbp > 120
## [1]  TRUE FALSE FALSE  TRUE  TRUE
diag <- ifelse( sbp > 120, "hypertension", "normal")
2. Number
ldl <- 100
typeof( ldl )
## [1] "double"
hdl <- 60
vldl <- 25
totalChol <- hdl + ldl + vldl
totalChol 
## [1] 185
3. Vector of numbers
cholesterol <- c(177, 193, 195, 209, 226)
typeof( cholesterol )
## [1] "double"
length( cholesterol )
## [1] 5
mean( cholesterol )
## [1] 200
max( cholesterol )
## [1] 226
4. Vector of characters
genotype <- c("AA", "AG", "GG")
typeof( genotype )
## [1] "character"
5. Factors
alleles <- c( "AG", "GG", "AA", "AA", "AA", "AA", "AA", "AG", "AG", "GG", "GG", "GG", "GG", "AA")
typeof( alleles )
## [1] "character"
alleles 
##  [1] "AG" "GG" "AA" "AA" "AA" "AA" "AA" "AG" "AG" "GG" "GG" "GG" "GG" "AA"
alleles <- factor( alleles )
levels( alleles )
## [1] "AA" "AG" "GG"
typeof( alleles )
## [1] "integer"
alleles
##  [1] AG GG AA AA AA AA AA AG AG GG GG GG GG AA
## Levels: AA AG GG
table( alleles )
## alleles
## AA AG GG 
##  6  3  5
6. Data Frames
genotype <- c("AA", "AA", "AA",  "AA", "AA", "GG", "GG", "GG", "GG")
mrna <- c(100, 90, 105, 87, 92, 20, 24, 35, 27)
ethnicity <- c("AFR", "AFR", "AFR", "AMR",  "AFR", "ASN", "EUR", "EUR", "EUR")
relapse <- c("yes", "yes", "yes", "no", "yes", "no", "no", "no", "no")
study <- data.frame( genotype, mrna, ethnicity, relapse)
dim( study )
## [1] 9 4
head( study )
##   genotype mrna ethnicity relapse
## 1       AA  100       AFR     yes
## 2       AA   90       AFR     yes
## 3       AA  105       AFR     yes
## 4       AA   87       AMR      no
## 5       AA   92       AFR     yes
## 6       GG   20       ASN      no

Load data from a file

# Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. 
# They describe characteristics of the cell nuclei present in the image. 
url <-"http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
pima <- read.table( url, sep=",")
colnames( pima ) <- c("pregnancy", "gtt", "dbp", "skinthickness", "insulin", "bmi", "familyhistory", "age", "diabetes")
head( pima )
##   pregnancy gtt dbp skinthickness insulin  bmi familyhistory age diabetes
## 1         6 148  72            35       0 33.6         0.627  50        1
## 2         1  85  66            29       0 26.6         0.351  31        0
## 3         8 183  64             0       0 23.3         0.672  32        1
## 4         1  89  66            23      94 28.1         0.167  21        0
## 5         0 137  40            35     168 43.1         2.288  33        1
## 6         5 116  74             0       0 25.6         0.201  30        0
dim( pima )
## [1] 768   9
table( pima$diabetes )
## 
##   0   1 
## 500 268

Box plot

boxplot( gtt ~ diabetes, 
         data = pima)

Box plot with added features

boxplot( gtt ~ diabetes, 
         data = pima, 
         boxwex = 0.3,
         main = "Pima Indians Diabetes Study",
         ylab = "Glucose concentration", 
         xlab = "Diabetes")

Two group comparison, glucose concentration in GTT

case <- pima$gtt[ pima$diabetes == 1]
control <- pima$gtt[ pima$diabetes == 0]
t.test( case, control ) 
## 
##  Welch Two Sample t-test
## 
## data:  case and control
## t = 13.752, df = 461.33, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  26.80786 35.74707
## sample estimates:
## mean of x mean of y 
##  141.2575  109.9800

Two group comparison, Diastolic blood pressure (DBP)

case <- pima$dbp[ pima$diabetes == 1]
control <- pima$dbp[ pima$diabetes == 0]
t.test( case, control ) 
## 
##  Welch Two Sample t-test
## 
## data:  case and control
## t = 1.7131, df = 471.31, p-value = 0.08735
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.388326  5.669580
## sample estimates:
## mean of x mean of y 
##  70.82463  68.18400

Box plot of DBP

boxplot( dbp ~ diabetes, 
         data = pima, 
         boxwex = 0.3,
         main = "Pima Indians Diabetes Study",
         ylab = "Diastolic blood pressure", 
         xlab = "Diabetes")

Logistic regression with a binary outcome

m1 <- glm( diabetes ~ pregnancy + gtt + dbp + bmi + familyhistory, 
           data=pima, 
           family = "binomial")
round( summary( m1 )$coef, 3) 
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)     -7.955      0.676 -11.771    0.000
## pregnancy        0.153      0.028   5.514    0.000
## gtt              0.035      0.003  10.213    0.000
## dbp             -0.012      0.005  -2.387    0.017
## bmi              0.085      0.014   6.006    0.000
## familyhistory    0.911      0.294   3.097    0.002

Forest plot R package

library("forestmodel")
## Loading required package: ggplot2
forest_model( m1 )

Population Stratification using PCA

# INSTALL REQUIRED R PACKAGES
#source("http://bioconductor.org/biocLite.R"); biocLite("SNPRelate"); 

# LOAD R PACKAGES
library(SNPRelate)
## Loading required package: gdsfmt
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
# PREPARE INPUT DATA
genofile = openfn.gds(snpgdsExampleFileName())

# PRUNE SNPs BASED ON LINKAGE DISEQUILIBRIUM (LD)
snpset = snpgdsLDpruning(genofile, ld.threshold=0.2); snpset.id = unlist(snpset)
## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
## SNP pruning based on LD:
## Excluding 365 SNPs on non-autosomes
## Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 279 samples, 8,722 SNPs
##     using 1 (CPU) core
##  Sliding window: 500000 basepairs, Inf SNPs
##  |LD| threshold: 0.2
## Chromosome 1: 75.84%, 543/716
## Chromosome 2: 73.05%, 542/742
## Chromosome 3: 74.38%, 453/609
## Chromosome 4: 73.67%, 414/562
## Chromosome 5: 76.68%, 434/566
## Chromosome 6: 74.87%, 423/565
## Chromosome 7: 75.42%, 356/472
## Chromosome 8: 71.52%, 349/488
## Chromosome 9: 77.64%, 323/416
## Chromosome 10: 74.12%, 358/483
## Chromosome 11: 77.63%, 347/447
## Chromosome 12: 76.58%, 327/427
## Chromosome 13: 76.45%, 263/344
## Chromosome 14: 76.60%, 216/282
## Chromosome 15: 76.34%, 200/262
## Chromosome 16: 72.30%, 201/278
## Chromosome 17: 73.91%, 153/207
## Chromosome 18: 73.68%, 196/266
## Chromosome 19: 83.33%, 100/120
## Chromosome 20: 71.62%, 164/229
## Chromosome 21: 76.19%, 96/126
## Chromosome 22: 77.59%, 90/116
## 6548 SNPs are selected in total.
# PEFORM PCA
pca = snpgdsPCA(genofile, 
                maf=0.05, 
                missing.rate=0.05, 
                snp.id=snpset.id, 
                num.thread=2)
## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
## Principal Component Analysis (PCA) on genotypes:
## Excluding 2,540 SNPs (non-autosomes or non-selection)
## Excluding 1,105 SNPs (monomorphic: TRUE, < MAF: 0.05, or > missing rate: 0.05)
## Working space: 279 samples, 5,443 SNPs
##     using 2 (CPU) cores
## PCA: the sum of all selected genotypes (0, 1 and 2) = 1526608
## Wed Jan 11 15:17:48 2017    (internal increment: 2684)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed      
## Wed Jan 11 15:17:48 2017    Begin (eigenvalues and eigenvectors)
## Wed Jan 11 15:17:48 2017    Done.
# DRAW A PLOT
plot(pca$eigenvect[,2], pca$eigenvect[,1], 
     xlab="Principal Component 2",
     ylab="Principal Component 1", 
     type="n")
race = as.factor(read.gdsn( index.gdsn(genofile, index=c("sample.annot", "pop.group"))))
table( race )
## race
## CEU HCB JPT YRI 
##  92  47  47  93
points(pca$eigenvect[,2], pca$eigenvect[,1], col=race)
legend("bottomleft", 
       title="Population", 
       legend=levels(race), 
       text.col=1:nlevels(race))

Manhattan Plot

library(manhattanly)
## See example usage at http://sahirbhatnagar.com/manhattanly/
dim( HapMap )
## [1] 14412     8
head( HapMap )
##   CHR      BP         P        SNP ZSCORE EFFECTSIZE    GENE DISTANCE
## 1   1  937641 0.3353438  rs9697358 0.9634    -0.0946   ISG15     1068
## 2   1 1136887 0.2458571 rs34945898 1.1605    -0.0947 TNFRSF4        0
## 3   1 2116240 0.8232859 rs12034613 0.2233    -0.0741  FP7162        0
## 4   1 2310562 0.4932038  rs4648633 0.6852     0.0146   MORN1        0
## 5   1 2681715 0.6053916  rs4430271 0.5167     0.1234   MMEL1   127427
## 6   1 2917484 0.1944431  rs6685625 1.2975     0.1979  ACTRT2    10421
HapMap[ HapMap$P < 10^(-8), ]
##      CHR       BP           P        SNP ZSCORE EFFECTSIZE      GENE
## 4338   5 28801403 4.70701e-09 rs16898108 5.8572     2.0314 LOC729862
## 4339   5 29102367 9.99901e-09   rs610094 5.7307     1.2378  AK098570
## 4340   5 29214921 6.70501e-09 rs16898772 5.7982     0.3107  AK098570
## 4341   5 29691322 6.22801e-09 rs16899251 5.8105    -0.8520  AK098570
## 4342   5 29775399 5.02201e-09  rs6450718 5.8464     0.4985  AK098570
## 4343   5 29950601 9.86401e-09  rs2972799 5.7331    -0.5233  AK098570
## 4344   5 30019370 3.95101e-09  rs2195387 5.8862     0.6305  AK098570
## 4345   5 30064404 8.01901e-09 rs16899785 5.7681    -0.7904  AK098570
## 4346   5 30262253 6.75010e-10 rs16900007 6.1718     1.6960      CDH6
## 4347   5 30797178 3.41101e-09  rs1039325 5.9105     1.3266      CDH6
##      DISTANCE
## 4338   161330
## 4339    77057
## 4340     6700
## 4341   483101
## 4342   567178
## 4343   742380
## 4344   811149
## 4345   856183
## 4346   967299
## 4347   432374
manhattanly(HapMap, 
            snp = "SNP",
            gene = "GENE", 
            annotation1 = "ZSCORE", annotation2 = "EFFECTSIZE", 
            highlight = significantSNP)